# Bernd Beber and Alexandra Scacco
# Replication code for "What the Numbers Say: A Digit-Based Test for Election Fraud"
# Last updated October 2011
# R version 2.11.1


# To load the R data file provided for replication:
# load("Beber_Scacco_Election_Fraud_replication.RData")
# attach(swe.data)
# attach(nga.data)
# attach(nga.ward.data)
# attach(sen.2000.data)
# attach(sen.2007.data)
# attach(chi.1924.data)
# attach(chi.1928.data)
# setEPS()


#############################################################################
###### Functions for digit extraction and analysis
#############################################################################

get.digits <- function(v, pos="last") {
  # Function to extract numerals from elements in a vector.
  # Args:
  #   v: Vector with elements from which numerals are to be extracted.
  #   pos: Position of the numeral to be extracted from each vector element.
  #     Set to "last" (the default) to get the last numeral, "penult" to
  #     extract the second-to-last digit, or a numeric value.
  # Returns:
  #   A vector containing the numerals to be retrieved.
  if(!pos %in% c("last", "penult") & !is.numeric(pos)) stop("Invalid position")
  s <- strsplit(as.character(v), c())
  if(pos == "last") out <- sapply(s, function(y) y[length(y)])
  if(pos == "penult") out <- sapply(s, function(y) ifelse(length(y) < 2, NA, y[length(y) - 1]))
  if(is.numeric(pos)) out <- sapply(s, function(y) y[pos])
  return(out)
}

digit.test <- function(v, id=NULL) {
  # Function to obtain frequencies and test statistics for last digits and digit pairs.
  # Args:
  #   v: Vector with elements to be analyzed (e.g. polling station vote counts).
  #   id: Strata within which vector elements are grouped (e.g. precincts containing
  #       polling stations). By default, elements are not grouped in strata.
  # Returns:
  #   A list with the elements listed below. Elements with prefix "all." are generated only
  #   if strata are specified and there is more than one stratum.
  #     count: A matrix containing last-digit frequencies, with one row for each stratum.
  #     prop: A matrix containing last-digit proportions, with one row for each stratum.
  #     chi: A vector containing the values of chi-square tests for deviations in last-digit
  #       frequencies, with one element for each stratum.
  #     pval: A vector containing the corresponding p-values.
  #     n: A vector containing the number of observations within each stratum.
  #     id: A vector containing stratum identifiers.
  #     distance: A matrix containing frequencies of the possible distances between last and
  #       penultimate digits, with one row for each stratum.
  #     distance.prop: A matrix containing the corresponding proportions.
  #     distance.chi: A vector with values of chi-square tests for deviations in the frequencies
  #       of distances in digit pairs, with one value per stratum.
  #     distance.pval: A vector containing the corresponding p-values.
  #     all.count: A vector with last-digit frequencies across all strata.
  #     all.prop: A vector with the corresponding proportions.
  #     all.n: The number of observations across all strata, a scalar.
  #     all.chi: Value of a chi-square test for deviations in last-digit frequencies, a scalar.
  #     all.pval: The corresponding p-value.
  #     all.distance: A vector with the frequencies of distances between last and penultimate
  #       digits across all strata.
  #     all.distance.prop: A vector containing the corresponding proportions.
  #     all.distance.chi: Value of a chi-square test for deviations in digit-pair distances,
  #       a scalar.
  #     all.distance.pval: The corresponding p-value.
  if(!is.vector(v) | (!is.vector(id) & !is.null(id))) stop("Inputs must be vectors")
  if(!identical(length(v), length(id)) & !is.null(id)) stop("Input vectors vary in length")
  if(is.null(id)) id <- rep(1, length(v))
  id.uni <- unique(id)
  count <- matrix(ncol=10, nrow=length(id.uni), dimnames=list(NULL, c(0:9)))
  distance <- matrix(ncol=6, nrow=length(id.uni), dimnames=list(NULL, c(0:5)))
  id.out <- c()
  for(i in 1:length(id.uni)){
    # Last-digit frequencies
    last <- get.digits(v[id == id.uni[i]], pos="last")
    freq.last <- table(as.numeric(last))
    count[i, dimnames(freq.last)[[1]]] <- freq.last
    if(sum(is.na(count[i, ])) < 10) count[i, is.na(count[i, ])] <- 0
    # Distance between last and penultimate digits
    penult <- get.digits(v[id == id.uni[i]], pos="penult")
    tolast <- abs(as.numeric(last) - as.numeric(penult))
    tolast[tolast > 5 & !is.na(tolast)] <- 10 - tolast[tolast > 5 & !is.na(tolast)]
    tolast <- table(tolast)
    distance[i, dimnames(tolast)[[1]]] <- tolast
    if(sum(is.na(distance[i, ])) < 6) distance[i, is.na(distance[i, ])] <- 0
    id.out <- c(id.out, id.uni[i])
  }
  # Compute proportions and test statistics
  prop <- t(apply(count, 1, function(x) x / sum(x)))
  n <- apply(count, 1, function(x) sum(x))
  chi <- apply(count, 1, function(x) sum((x - .1 * sum(x))^2 / (.1 * sum(x))))
  pval <- 1 - pchisq(chi, 9)
  distance.prop <- t(apply(distance, 1, function(x) x / sum(x)))
  expect <- c(.1, .2, .2, .2, .2, .1)
  distance.chi <- apply(distance, 1, function(x) sum((x - expect * sum(x))^2 / (expect * sum(x))))
  distance.pval <- 1 - pchisq(distance.chi, 5)
  # If there are multiple strata, compute proportions and test statistics across all of them
  if(length(id.uni) > 1) {
    all.count <- apply(count, 2, sum)
    all.prop <- all.count / sum(n)
    all.n <- sum(all.count)
    all.chi <- sum((all.count - .1 * all.n)^2 / (.1 * all.n))
    all.pval <- 1 - pchisq(all.chi, 9)
    all.distance <- apply(distance, 2, sum)
    all.distance.prop <- all.distance / sum(n)
    all.distance.chi <- sum((all.distance - expect * all.n)^2 / (expect * all.n))
    all.distance.pval <- 1 - pchisq(all.distance.chi, 5)
    out <- list(count=count, prop=prop, chi=chi, pval=pval, n=n, id=id.out,
                distance=distance, distance.prop=distance.prop, distance.chi=distance.chi,
                distance.pval=distance.pval, all.count=all.count, all.prop=all.prop,
                all.n=all.n, all.chi=all.chi, all.pval=all.pval, all.distance=all.distance,
                all.distance.prop=all.distance.prop, all.distance.chi=all.distance.chi,
                all.distance.pval=all.distance.pval)
  } else {
    out <- list(count=count, prop=prop, chi=chi, pval=pval, n=n, id=id.out,
                distance=distance, distance.prop=distance.prop,
                distance.chi=distance.chi, distance.pval=distance.pval)
  }
  return(out)
}


#############################################################################
###### Simulations of last-digit frequencies for various distributions
#############################################################################

# Re-parameterize various distributions in terms of first two moments.
# Distributions are truncated from below at and not including 0.
# Mean and standard deviation are moments of non-truncated distribution.
# Args for the following set of functions:
#   n: Number of observations.
#   mu: Mean of distribution from which observations are drawn.
#   sd: Standard deviation of distribution from which observations are drawn.
# Return:
#   A vector containing draws from the relevant distribution.

# Binomial distribution, which requires sd^2 < mu
rbinom2 <- function(n, mu, sd) {
    out <- rbinom(n, size = round(mu^2 / (mu - sd^2)), prob = 1 - (sd^2 / mu))
    repeat {
        out[out==0] <- rbinom(sum(out==0), size = round(mu^2 / (mu - sd^2)), prob = 1 - (sd^2 / mu))
        if(sum(out==0)==0) break 
    }
    return(out)
}

# Negative binomial distribution, which requires sd^2 > mu
rnbinom2 <- function(n, mu, sd) {
    out <- rnbinom(n, size = mu^2 / (sd^2 - mu), prob = mu / sd^2)
    repeat {
        out[out==0] <- rnbinom(sum(out==0), size = mu^2 / (sd^2 - mu), prob = mu / sd^2)
        if(sum(out==0)==0) break 
    }
    return(out)
}

# Poisson distribution, which requires sd^2 = mu
rpois2 <- function(n, mu) {
    out <- rpois(n, lambda=mu)
    repeat {
        out[out==0] <- rpois(sum(out==0), lambda=mu)
        if(sum(out==0)==0) break 
    }
    return(out)
}

# Discretized normal distribution
rnorm2 <- function(n, mu, sd) {
    out <- rnorm(n=n, mean=mu, sd=sd)
    repeat {
        out[out<=.5] <- rnorm(sum(out<=.5), mean=mu, sd=sd)
        if(sum(out<=.5)==0) break 
    }
    round(out)
}

# Discretized uniform distribution
runif2 <- function(n, mu, sd) {
  out <- runif(n, min = mu - sqrt(3) * sd, max = mu + sqrt(3) * sd)
  repeat {
    out[out<=.5] <- runif(sum(out<=.5), min = mu - sqrt(3) * sd, max = mu + sqrt(3) * sd)
    if(sum(out<=.5)==0) break 
  }
  round(out)
}

# Discretized gamma distribution
rgamma2 <- function(n, mu, sd) {
  out <- rgamma(n, shape = mu^2 / sd^2, rate = mu / sd^2)
  repeat {
    out[out<=.5] <- rgamma(sum(out<=.5), shape = mu^2 / sd^2, rate = mu / sd^2)
    if(sum(out<=.5)==0) break 
  }
  round(out)
}

# Discretized mechanism A from Mebane (2006)
rmechA <- function(n, mu, sd) {
  tmp <- find.par.A(mu, sd)
  out <- mechA(size=tmp[1], nprecincts=n, mf=tmp[2], lgp=tmp[3], hgp=tmp[4], lb=tmp[5], ha=tmp[6])
  repeat {
    if(all(is.na(out))) break 
    out[out<=0] <- mechA(size=tmp[1], nprecincts=sum(out<=0), mf=tmp[2], lgp=tmp[3], hgp=tmp[4], lb=tmp[5], ha=tmp[6])
    if(sum(out<=0)==0) break 
  } 
  return(out)
}

# Discretized mechanism B from Mebane (2006)
rmechB <- function(n, mu, sd) {
  tmp <- find.par.B(mu, sd)
  out <- mechB(size=tmp[1], nprecincts=n, mf=tmp[2], onen=tmp[3], twon=tmp[4], onev=tmp[5], twov=tmp[6])
  repeat {
    if(all(is.na(out))) break 
    out[out<=0] <- mechB(size=tmp[1], nprecincts=sum(out<=0), mf=tmp[2], onen=tmp[3], twon=tmp[4], onev=tmp[5], twov=tmp[6])
    if(sum(out<=0)==0) break 
  }
  return(out)
}

  # Define workhorse functions for rmechA and rmechB

    # Random generation functions as defined by Mebane (2006)
    mechA <- function(size, nprecincts=1000, mf=1/3, lgp=1, hgp=1, lb=4, ha=4) {
      lgb <- exp(lgp) / (exp(lgp) + exp(hgp) + 1)
      hgb <- exp(hgp) / (exp(lgp) + exp(hgp) + 1)
      mgb <- 1 / (exp(lgp) + exp(hgp) + 1)
      sapply(1:nprecincts, function(x) {
        p3 <- c(rbeta(1, 1/2, lb), mf, rbeta(1, ha, 1/2))
        q <- runif(1, 0, 1)
        pf <- c(q * lgb, mgb, (1 - q) * hgb)
        trunc(sum(size * p3 * pf / sum(pf)))
      })
    }

    mechB <- function(size, nprecincts=1000, mf=1/3, onen=1, twon=1, onev=1, twov=1) {
      sapply(1:nprecincts, function(x) {
        p3 <- c(0, mf, 1)
        onex <- ifelse(is.na(onen) & is.na(onev), NA, rnorm(1, onen, onev))
        twox <- ifelse(is.na(twon) & is.na(twov), NA, rnorm(1, twon, twov))
        pf <- c(exp(onex), 1, exp(twox)) / (exp(onex) + exp(twox) + 1)
        q <- runif(1, 0, 1)
        trunc(c(q * size * sum(p3 * pf)))
      })
    }

    # Search for parameters that correspond to given mean and standard deviation
    sizesA <- seq(10, 790, by=60)
    mfs <- seq(0, 1, by=.11)
    lgps <- seq(-5, 13, by=3)
    hgps <- seq(-5, 13, by=3)
    lbs <- seq(1, 16, by=3)
    has <- seq(1, 16, by=3)

    sizesB <- seq(10, 1570, by=120)
    onens <- seq(-5, 5, by=2)
    twons <- seq(-5, 5, by=2)
    onevs <- seq(1, 9, by=2)
    twovs <- seq(1, 9, by=2)

    set.seed(26022011)
    simA <- array(NA, c(length(sizesA), length(mfs), length(lgps), length(hgps),
                  length(lbs), length(has), 2))
    for(i in 1:length(sizesA)) {
      for(j in 1:length(mfs)) {
        for(k in 1:length(lgps)) {
          for(l in 1:length(hgps)) {
            for(m in 1:length(lbs)) {
              for(n in 1:length(has)) {
                tmp <- mechA(size=sizesA[i], mf=mfs[j], lgp=lgps[k], hgp=hgps[l],
                             lb=lbs[m], ha=has[n])
                simA[i, j, k, l, m, n, 1] <- mean(tmp)
                simA[i, j, k, l, m, n, 2] <- sd(tmp)
              }
            }
          }
        }
      }
      print(paste(round(100 * i / length(sizesA)), "% done", sep=""))
      flush.console()
    }

    set.seed(26022011)
    simB <- array(NA, c(length(sizesB), length(mfs), length(onens), length(twons),
                  length(onevs), length(twovs), 2))
    for(i in 1:length(sizesB)) {
      for(j in 1:length(mfs)) {
        for(k in 1:length(onens)) {
          for(l in 1:length(twons)) {
            for(m in 1:length(onevs)) {
              for(n in 1:length(twovs)) {
                tmp <- mechB(size=sizesB[i], mf=mfs[j], onen=onens[k], twon=twons[l],
                             onev=onevs[m], twov=twovs[n])
                simB[i, j, k, l, m, n, 1] <- mean(tmp)
                simB[i, j, k, l, m, n, 2] <- sd(tmp)
              }
            }
          }
        }
      }
      print(paste(round(100 * i / length(sizesB)), "% done", sep=""))
      flush.console()
    }

    # Use functions find.par.A and find.par.B to retrieve parameters
    find.par.A <- function(mu, sd, tol=100) {
      # Available matches for means and standard deviations:
      #   plot(simA[, , , , , , 1], simA[, , , , , , 2])
      to.min <- (simA[, , , , , , 1] - mu)^2 + (simA[, , , , , , 2] - sd)^2
      # Select only reasonable matches
      to.sel <- min(to.min)
      if(to.sel > tol) {
        out <- rep(NA, 6)
      } else {
        idx <- which(to.min == to.sel, arr.ind=T)
        out <- c(sizesA[idx[1]], mfs[idx[2]], lgps[idx[3]], hgps[idx[4]], lbs[idx[5]], has[idx[6]])
      }
      return(out)
    }

    find.par.B <- function(mu, sd, tol=100) {
      to.min <- (simB[, , , , , , 1] - mu)^2 + (simB[, , , , , , 2] - sd)^2
      to.sel <- min(to.min)
      if(to.sel > tol) {
        out <- rep(NA, 6)
      } else {
        idx <- which(to.min == to.sel, arr.ind=T)
        out <- c(sizesB[idx[1]], mfs[idx[2]], onens[idx[3]], twons[idx[4]], onevs[idx[5]], twovs[idx[6]])
      }
      return(out)
    }

digit.simulations <- function(mus=seq(1, 200, length=20),
                              sds=seq(1, 100, length=20),
                              nsims=1000, level=.05) {
  # Function to simulate draws from distributions specified above and evaluate whether
  # last-digit numerals occur with equal frequency for a given distribution, mean, and variance.
  # Args:
  #   mus: A vector of means for distributions. Defaults to 20 means between 1 and 100.
  #   sds: A vector of standard deviations. Defaults to 20 values between 1 and 50.
  #   nsims: Number of draws for each combination of distribution, mean, and standard deviation.
  #     Defaults to 1000.
  #   level: Significance level for chi-square test of equifrequent last digits. Default is .05.
  # Returns:
  #   A logical array indicating whether simulations show that last digits do not occur
  #   equifrequently for a given distribution, mean, and standard deviation. The array has
  #   three dimensions, where the first index is for means, the second for standard deviations,
  #   and the third for the following distributions (in this order): Binomial/negative binomial/
  #   Poisson, discretized and truncated normal, discretized and truncated uniform, discretized
  #   Gamma, mechanism A and mechanism B from Mebane (2006).
  out <- array(NA, c(length(mus), length(sds), 6))
  for(i in 1:length(mus)) {
    for(j in 1:length(sds)) {
      if(mus[i] > sds[j]^2) out[i, j, 1] <- digit.test(rbinom2(nsims, mus[i], sds[j]))$pval
      if(mus[i] < sds[j]^2) out[i, j, 1] <- digit.test(rnbinom2(nsims, mus[i], sds[j]))$pval
      if(mus[i] == sds[j]^2) out[i, j, 1] <- digit.test(rpois2(nsims, mus[i]))$pval
      out[i, j, 2] <- digit.test(rnorm2(nsims, mus[i], sds[j]))$pval
      out[i, j, 3] <- digit.test(runif2(nsims, mus[i], sds[j]))$pval
      out[i, j, 4] <- digit.test(rgamma2(nsims, mus[i], sds[j]))$pval
      out[i, j, 5] <- digit.test(rmechA(nsims, mus[i], sds[j]))$pval
      out[i, j, 6] <- digit.test(rmechB(nsims, mus[i], sds[j]))$pval
      }
    }
    return(out <= level)
}

# Run simulations for three repetitions
# Since simulations are run over a 20 x 20 grid of means and standard deviations, we test 400 hypotheses for each distribution
# We want the probability of getting a significant result by chance to be .05 for each distribution, not each hypothesis test
# So for each hypothesis test, this probability has to be .05 / 400 = .000125
# Therefore we require a significant result three times in a row, since .05^3 = .000125
set.seed(26022011)
sims.out.1 <- digit.simulations()
sims.out.2 <- digit.simulations()
sims.out.3 <- digit.simulations()

# Reject null hypothesis of equifrequent last digits if distribution fails all runs
sims.out <- (sims.out.1 + sims.out.2 + sims.out.3) == 3

# Prepare array for graph
sims.colors <- replace(sims.out, which(sims.out==TRUE, arr.ind=T), "black")
sims.colors <- replace(sims.colors, which(sims.out=="FALSE", arr.ind=T), "grey")
sims.colors <- replace(sims.colors, which(is.na(sims.out), arr.ind=T), "white")

# Function to plot output array from simulations
sims.dots <- function(color.matrix, xs=seq(1, 200, length=20), ys=seq(1, 100, length=20), ...){
  omgp <- par("mgp")
  par(mgp=c(2, .3, 0))
  plot(NA, xlim=range(xs), ylim=range(ys), xaxt="n", yaxt="n", ...)
  axis(2, seq((min(ys) %/% 10) * 10, (max(ys) %/% 10) * 10, by=10),
       labels=seq((min(ys) %/% 10) * 10, (max(ys) %/% 10) * 10, by=10), las=1, tick=F)
  axis(1, seq((min(xs) %/% 10) * 10, (max(xs) %/% 10) * 10, by=10),
       labels=seq((min(xs) %/% 10) * 10, (max(xs) %/% 10) * 10, by=10), tick=F)
  for(i in 1:length(xs)) {
    for(j in 1:length(ys)) {
      if(color.matrix[i, j] == "black") {
        points(xs[i], ys[j], col=color.matrix[i, j], pch=4, cex=1, lwd=3.5)
      } else {
        points(xs[i], ys[j], col=color.matrix[i, j], pch=19, cex=1.7)
      }
    }
  }
  par(mgp=omgp)
}

# Create graph of output from simulations
postscript("Distributions.eps", width=8, height=10, bg="white", paper="special", horizontal=F, family="ComputerModern", encoding="TeXtext.enc")
  par(mfrow=c(3,2))
  par(mar=c(3,3.5,4,1))
  sims.dots(sims.colors[, , 1], ylab=expression(paste("Standard deviation ",sigma)), xlab=expression(paste("Mean ",mu)))
  text(100, 118, expression(paste("Binomial (for ", sigma^2<mu, "), negative binomial (for ", sigma^2>mu, "),")), xpd=NA, cex=1.1)
  text(100, 110, expression(paste("and Poisson (for ", sigma^2==mu, ") distributions")), xpd=NA, cex=1.1)
  sims.dots(sims.colors[, , 2], ylab="", xlab="")
  text(100, 114, expression(paste("Discretized normal distribution")), xpd=NA, cex=1.1)
  sims.dots(sims.colors[, , 3], ylab="", xlab="")
  text(100, 114, expression(paste("Discretized uniform distribution")), xpd=NA, cex=1.1)
  sims.dots(sims.colors[, , 4], ylab="", xlab="")
  text(100, 114, expression(paste("Discretized Gamma distribution")), xpd=NA, cex=1.1)
  sims.dots(sims.colors[, , 5], ylab="", xlab="")
  text(100, 118, expression(paste("Discretized mix of distributions")), xpd=NA, cex=1.1)
  text(100, 110, expression(paste("(Mechanism A, Mebane 2006)")), xpd=NA, cex=1.1)
  sims.dots(sims.colors[, , 6], ylab="", xlab="")
  text(100, 118, expression(paste("Discretized mix of distributions")), xpd=NA, cex=1.1)
  text(100, 110, expression(paste("(Mechanism B, Mebane 2006)")), xpd=NA, cex=1.1)
  par(mfrow=c(1,1))
  par(mar=c(5,4,4,2) + 0.1)
dev.off()


#############################################################################
###### Functions for graphs
#############################################################################

# Function to wrap text in graphs
wrap <- function(text.vector, char.per.line=20) {
  as.vector(sapply(text.vector, function(x) paste(strwrap(x, width=char.per.line), collapse="\n")))
}

# Function to create vector of length l with values around scalar s, each d away from its neighbor but no larger than max or smaller than min
shift <- function(s, l, d, min=-Inf, max=Inf) {
  if(l %% 2 == 0) out <- sort(s + d * rep(0:l, each=2)[2:(l+1)] * (-1)^(1:l) - d/2) else
                  out <- sort(s + d * rep(0:l, each=2)[2:(l+1)] * (-1)^(1:l))
  if(min(out) < min & max(out) > max) out <- rep(NA, l) else
    if(min(out) < min) out <- min + 0:(l - 1) * d else
      if(max(out) > max) out <- sort(max - 0:(l - 1) * d)
  return(out)
}

# Function to shift overlapping points (an alternative to jitter)
spread <- function(x, tol=.1, min=-Inf, max=Inf) {
  if(length(x)==1) out <- x else {
    if(1 + (max - min) / tol < length(x)) out <- rep(NA, length(x)) else {
      o <- (1:length(x))[order(x)]
      x <- x[order(x)]
      d <- x[2:length(x)] - x[1:(length(x) - 1)] 
      b <- c(1, 1)
      skip <- 0
      for (i in 1:length(d)) {
        if (d[i] <= tol) next
        repeat {
          if(length(b)==1) break
          check <- x[b[length(b)]:i]
          check.s <- shift(median(check), length(check), tol, min=min, max=max)
          if(max(check.s) > x[i + 1] - tol) {
            skip <- i
            break }
          prev <- x[b[length(b) - 1]:(b[length(b)] - 1)]
          ifelse(min(check.s) < max(shift(median(prev), length(prev), tol, min=min, max=max)) + tol, b <- b[1:(length(b) - 1)], break) }
        if(skip != i) b <- c(b, i + 1) }
      b <- c(b, length(x) + 1)
      out <- c()
      for (i in 1:(length(b) - 1)) {
        g <- x[b[i]:(b[i + 1] - 1)]
        s <- shift(median(g), length(g), tol, min=min, max=max)
        out <- c(out, s[order(s)]) }
      out <- out[order(o)] } }
  return(out)
}

# Function to shift overlapping points in two-space (but without check that scattered points don't create new overlap)
scatter <- function(x, y=NULL, d=.1, tol=.001, check=c("x", "y"), move="x", min=-Inf, max=Inf) {
  mat <- cbind(eval(parse(text=check[1])), eval(parse(text=check[2])))
  distmat <- as.matrix(dist(mat))
  pts <- 1:nrow(distmat)
  while(length(pts)>0) {
    now <- min(pts)
    dup <- which(distmat[now, ]<tol)
    pts <- pts[!pts %in% dup]
    if(move=="y") y[dup] <- y[dup] + shift(0, length(dup), d=d, min=min, max=max) else
                            x[dup]<- x[dup] + shift(0, length(dup), d=d, min=min, max=max)
  }
  return(cbind(x, y))
}

# Function to compress values in v within range from exactly l to h
compress <- function(v, l=0, h=1)  (v - min(v)) * (h-l)/(max(v)-min(v)) + l

# Function to capitalize words, adapted from help menu for chartr
capwords <- function(s, hyphen=T) {
  cap <- function(s) paste(toupper(substring(s, 1, 1)), substring(s, 2), sep="", collapse=" ")
  out <- sapply(strsplit(s, split=" "), cap, USE.NAMES = !is.null(names(s)))
  if(hyphen) {
    cap <- function(s) paste(toupper(substring(s, 1, 1)), substring(s, 2), sep="", collapse="-")
    out <- sapply(strsplit(out, split="-"), cap, USE.NAMES = !is.null(names(out)))
  }
  return(out)
}

# Function to round and dispose of leading 0 and to return character for display
zap <- function(v, digits=0) {
  sapply(v, function(x) {
    o <- as.character(round(x,digits))
    if(substr(o,1,2)=="0.") o <- substring(o,2)
    if(substr(o,1,3)=="-0.") o <- paste("-", substring(o,3), sep="")
    return(o)
  } )
}

# Function to plot proportions of last digits
prop.plot <- function(PROP, MAIN=NULL, SUB=NULL, ci.u, ci.l, yrange=c(0, .22)) {
    omgp <- par("mgp")
    par(mgp=c(3, .3, 0))
    plot(NA, xlim=c(-.5, 9.5), ylim=yrange, yaxs="i", xaxt="n", yaxt="n", main=MAIN, sub=SUB, ylab="", xlab="")
    abline(h=ci.u, lty=2)
    abline(h=ci.l, lty=2)
    abline(h=.1, lty="22")
    axis(2, c(yrange[1], ci.l, .1, ci.u, yrange[2]),
         labels=c(zap(yrange[1],2), zap(ci.l,2), "Mean", zap(ci.u,2), zap(yrange[2],2)), las=1, tick=F)
    axis(1, 0:9, 0:9, tick=F)
    for(k in 1:length(PROP)){
      if(PROP[k]>ci.u | PROP[k]<ci.l) lines(rep(names(PROP)[k], 2), c(-1, PROP[k]), col="black", lwd=10, lend=1)
      else lines(rep(names(PROP)[k], 2), c(-1, PROP[k]), col="grey", lwd=10, lend=1)
    }
    box()
    par(mgp=omgp)
}

# Function to plot proportions of last digits, by strata
# Confidence bounds here test whether any single digit appears particularly often
# DIGITS is an object returned by function digit.test
prop.plot.many <- function(DIGITS, MAIN, highlight=NULL, highlight.col="black", arrange=c(7, 10)) {
  omfrow <- par("mfrow"); omar <- par("mar"); omgp <- par("mgp")
  # Put multiple plots on device
  par(mfrow=arrange)
  par(mar=c(.8, 1.5, .6, .3))
  par(mgp=c(3, .3, 0))
  # Set up initial plots for title
  plot(NA, xlim=c(0,1), ylim=c(0,1), yaxt="n", xaxt="n", bty="n")
  text(0, .5, MAIN, adj=c(0, .5), xpd=NA, cex=1.2, font=2)
  plot.new()
  plot.new()
  # Loop through wards, sort by size
  toloop <- (1:length(DIGITS$id))[order(DIGITS$n)]
  cr <- 1
  for(i in toloop) {
    prop <- DIGITS$prop[i, ]
    n <- DIGITS$n[i]
    ci.u <- ci["Proportion, 95%", ci[1, ]==n]
    if(sum(is.na(prop))==10) next
    plot(NA, xlim=c(-.5, 9.5), ylim=c(0, 1), xaxt="n", yaxt="n")
    abline(h=ci.u, lty=2)
    if(cr==1) axis(2, c(0, ci.u, 1), labels=c("0", zap(ci.u, 1), "1"), tick=F, las=1, cex.axis=.9)
    else axis(2, ci.u, labels=zap(ci.u, 1), las=1, tick=F, cex.axis=.9)
    par(mgp=c(3, .1, 0))
    if(cr<(prod(arrange) - arrange[2] - 2)) axis(1, c(0, 2, 4, 6, 8), c(0, 2, 4, 6, 8), tick=F, cex.axis=.9)
    par(mgp=c(3, .3, 0))
    cr <- cr + 1
    for(k in 1:length(prop)) {
      if(prop[k]==0) next
      if(prop[k]>ci.u & ci.u<1) lines(rep(names(prop)[k], 2), c(-1, prop[k]), col="black", lwd=3)
      else lines(rep(names(prop)[k], 2), c(-1, prop[k]), col="grey", lwd=3)
    }
    if(DIGITS$id[i] %in% highlight) box(col=highlight.col, lwd=3) else box()
  }
  par(mfrow=omfrow); par(mar=omar); par(mgp=omgp)
}

# Function to plot the distance between the largest of each strata's last-digit proportions and the upper limit of the relevant 95% confidence interval
# DIGITS is an object returned by digit.test
# LABELS is a two-column matrix, with ID codes in the first column and names in the second column
# MAIN and other options define plot characteristics
prop.plot.all <- function(DIGITS, LABELS, bw=T, lab.thold=0, lab.dist=.13, spread.tol=.006) {
  omar <- par("mar"); omgp <- par("mgp")
  par(mar=c(4, 4, 1, 1))
  par(mgp=c(2.5, .5, 0))
  # Get relevant statistic
  ci.u <- ci["Proportion, 95%", sapply(DIGITS$n, function(x) which(ci[1, ]==x))]
  dis <- apply(DIGITS$prop, 2, function(x) x - ci.u)
  maxval <- apply(dis, 1, "max")
  maxlab <- apply(dis, 1, function(x) names(x)[which(x==max(x))])
  plot(NA, xlim=c(0, 9), ylim=c(min(maxval), max(maxval)), xaxt="n", yaxt="n", cex.axis=.9,
       xlab="Most frequent last digit on ward return sheet",
       ylab="Digit proportion in excess of 95% confidence interval")
  axis(1, 0:9)
  axis(2, axTicks(2), zap(axTicks(2), 2), las=1, tick=F)
  abline(h=0, lty=2)
  xval <- sapply(maxlab, function(x) max(as.numeric(x)))
  xydots <- scatter(xval, maxval, d=.07)
  xytext <- cbind(xval, maxval)
  cexval <- log(DIGITS$n)
  # Scale values for size of points to be between .5 and 3
  cexval <- compress(cexval, .5, 3)
  # Color by reported turnout
  colval <- round(ward.turnout[2, which(ward.turnout[1, ]==DIGITS$id)] * 100)
  colval <- colval - min(colval) + 1
  ifelse(bw, cols <- gray(seq(0, 5/6, length=max(colval))),
             cols <- rainbow(max(colval), start=0, end=2/6))
  if(!bw) cols <- cols[length(cols):1]
  # Parameters 'start' and 'end' specify subranges:
  # red=0, yellow=1/6, green=2/6, cyan=3/6, blue=4/6, magenta=5/6
  # Loop through wards, plot large ones first
  toloop <- 1:nrow(xydots)
  toloop <- toloop[order(DIGITS$n, decreasing=T)]
  for(i in toloop) {
      points(xydots[i,1], xydots[i,2], col=cols[colval[i]], pch=19, cex=cexval[i])
  }
  # Add turnout and point legend
  xcols <- seq(7, 9, l=length(cols))
  for(i in 1:length(cols)) {
    lines(rep(xcols[i], 2), c(max(maxval) - (.05 * (max(maxval) - min(maxval))), max(maxval)),
          col=cols[i], lwd=4, lend=2)
  }
  colslab <- round(ward.turnout[2, ] * 100)
  text(c(7, 8, 9), c(max(maxval) - (.06 * (max(maxval) - min(maxval)))),
       c(paste(min(colslab), "%", sep=""), "Turnout", paste(max(colslab), "%", sep="")),
       cex=.7, adj=c(.5, 1))
  text(8, c(max(maxval) - (.1 * (max(maxval) - min(maxval)))), "Point size is proportional to\nnumber of polling stations", cex=.7, adj=c(.5, 1))
  # Identify wards
  lab <- LABELS[LABELS[, 1] %in% DIGITS$id[xydots[, "y"] > lab.thold], ]
  if(nrow(lab)==0) stop
  lab <- unique(lab)
  xytext <- xytext[DIGITS$id %in% lab[, 1], ]
  for(i in unique(xytext[, 1])) xytext[xytext[, 1]==i, 2] <- spread(xytext[xytext[, 1]==i, 2], tol=spread.tol, min=spread.tol, max=max(maxval))
  h <- xytext[, 1] > 5
  if(sum(h) > 0) text(xytext[h, 1] - lab.dist, xytext[h, 2], lab[h, 2], cex=.7, adj=c(1, .5))
  if(sum(!h) > 0) text(xytext[!h, 1] + lab.dist, xytext[!h, 2], lab[!h, 2], cex=.7, adj=c(0, .5))
  box()
  # reset plot parameters
  par(mar=omar); par(mgp=omgp)
}

# Function to plot distribution of p-values from a set of chi-squared tests of equifrequent last digits
pval.plot <- function(PVALS, yrange=c(0, 3), bins=c(0, .05, .1, .2, .5, 1), col=c("black", "grey75", "grey95", rep("white", 3)), ylab="Density", xlab="P-values", main="") {
    omgp <- par("mgp"); omar <- par("mar")
    par(mar=c(3.5, 4.5, 4, 1))
    par(mgp=c(3, .3, 0))
    hist(PVALS, breaks=bins, col=col, ylim=yrange, xaxt="n", yaxt="n", ylab=ylab, xlab="", main=main)
    abline(h=1, lty=2, xlim=c(0,1))
    axis(2, axis(2, labels=F, tick=F), zap(axis(2, labels=F, tick=F), 1), las=1, tick=F)
    axis(1, bins, bins, tick=F)
    par(mgp=c(2, .3, 0))
    title(xlab=xlab)
    box()
    par(mar=omar); par(mgp=omgp)
}

# Function to plot proportions for particular distances between last and penultimate digits (e.g. for polling station returns), by strata (e.g. ward)
# DIGITS is an object returned by digit.test
# WHICH must be 0 (same digit), 1 (distance of one), or 2 (distance greater than 1)
distance.plot <- function(DIGITS, MAIN, WHICH=0, xrange=NULL, yrange=NULL, print.lab=F, pt.lab="top", country="Nigeria") {
  omar <- par("mar"); omgp <- par("mgp")
  par(mar=c(5, 4, 1, 3))
  par(mgp=c(2.4, .4, 0))
  # Get relevant statistic
  x <- c()
  for(i in 1:nrow(DIGITS$distance.prop)) {
    if(WHICH==0) x <- c(x, DIGITS$distance.prop[i, 1] - ci["Repetition, 5%", ci[1, ]==DIGITS$n[i]])
    if(WHICH==1) x <- c(x, DIGITS$distance.prop[i, 2] - ci["Adjacency, 95%", ci[1, ]==DIGITS$n[i]])
    if(WHICH==2) x <- c(x, sum(DIGITS$distance.prop[i, 3:6]) - ci["Non-adjacency, 5%", ci[1, ]==DIGITS$n[i]])
  }
  y <- log(DIGITS$n)
  if(WHICH==0) xlab="Frequency with which last two digits are identical,\nrelative to 95% confidence bound"
  if(WHICH==1) xlab="Frequency with which distance between last two digits is one,\nrelative to 95% confidence bound"
  if(WHICH==2) xlab="Frequency with which distance between last two digits is more than one,\nrelative to 95% confidence bound"
  if(length(yrange)==0) yrange <- c(min(y, na.rm=T), max(y, na.rm=T))
  if(length(xrange)==0) xrange <- c(min(x, na.rm=T), max(x, na.rm=T))
  plot(NA, xlim=xrange, ylim=yrange, main=MAIN, cex.axis=1, xaxt="n", yaxt="n", xlab=xlab, ylab="")
  if(print.lab==T & country=="Nigeria") title(ylab="Number of polling stations in ward (logged)", xpd=NA)
  if(print.lab==T & country=="Sweden") title(ylab="Number of wards in municipality (logged)", xpd=NA)
  axis(1, seq(round(xrange[1], 1), round(xrange[2], 1), .1), zap(seq(round(xrange[1], 1), round(xrange[2], 1), .1), 1), las=1)
  axis(2, log(c(2, 5, 10, 20, 50, 100, 300, 1000)), c(2, 5, 10, 20, 50, 100, 300, 1000), las=1, tick=F)
  if(country=="Nigeria") cexval <- ward.turnout[2, sapply(DIGITS$id, function(x) which(ward.turnout[1, ]==x))]
  if(country=="Sweden") cexval <- muni.turnout[2, sapply(DIGITS$id, function(x) which(muni.turnout[1, ]==x))]
  # Scale values for size of points to be between .5 and 3
  cexval <- compress(cexval, .05, 4)
  abline(v=0, lty=2)
  if(WHICH==0 | WHICH==2) points(x[x<0], y[x<0], col="black", pch=19, cex=cexval[x<0])
  if(WHICH==1) points(x[x<0], y[x<0], col="grey", pch=19, cex=cexval[x<0])
  points(x[x==0], y[x==0], col="grey", pch=19, cex=cexval[x==0])
  if(WHICH==0 | WHICH==2) points(x[x>0], y[x>0], col="grey", pch=19, cex=cexval[x>0])
  if(WHICH==1) points(x[x>0], y[x>0], col="black", pch=19, cex=cexval[x>0])
  # Legend with regard to point size
  if(pt.lab=="top" & print.lab==T) text(min(xrange), max(yrange), "Point size is proportional to turnout", adj=c(0, .5), cex=1)
  if(pt.lab=="bottom" & print.lab==T) text(min(xrange), min(yrange), "Point size is proportional to turnout", adj=c(0, 0), cex=1)
  # Reset plot parameters
  par(mar=omar); par(mgp=omgp)
}


#############################################################################
###### Load and prepare data
#############################################################################

# SWEDEN

# Final results from 2002 Riksdagsval in Sweden
# Vote returns are reported at ward (valdistrikt) level, within municipalities (kommun), within counties (l�n)
swe.data <- read.csv("Sweden_2002_Riksdagsval.csv")

# Exclude invalid votes reported in separate rows
swe.data <- swe.data[-grep("R", swe.data[,"Distriktskod"]),]

# Attach data
attach(swe.data)

# Swedish data contains the following variables:
# L�n -- county code
# Kommun -- municipality code
# Distriktskod -- ward code
# Namn.p�.valdistrikt -- ward name
# Kommunvalkrets -- code for municipality election
# Antal.r�stber�ttigade -- number of registered voters
# M -- votes for Moderate Party (Moderata samlingspartiet, MSP)
# C -- votes for Centre Party (Centerpartiet)
# FP -- votes for Liberal People's Party (Folkpartiet liberalerna)
# KD -- votes for Christian Democrats (Kristdemokraterna)
# S -- votes for Social Democratic Party (Socialdemokratiska arbetareparti, SAP)
# V -- votes for Left Paty (V�nsterpartiet)
# MP -- votes for Green Party (Milj�partiet de Gr�na)
# Other -- votes for any other parties
# Total -- total votes cast
# "partif�rkortning" means "party abbreviation," "r�stetal osv" means "vote count etc."
# "I vallokal ej r�knande r�ster" means "ineligible votes cast"

# Create a unique municipality ID
muni <- L�n * 1000 + Kommun
# Number of municipalities in data
n.muni <- length(muni)


# NIGERIA

# Results from Plateau State from 2003 Presidential election in Nigeria
# Vote returns are reported at polling station level, within wards, within local government areas (LGAs)
nga.data <- read.csv("Nigeria_2003_Presidential_Plateau_polling_stations.csv")
# A single return sheet was used if no number entered
nga.data[is.na(nga.data[,"sheet"]),"sheet"] <- 1

# Party received zero votes, if no number was entered; similarly for rejected votes
party.names <- c("AD", "ANPP", "APGA", "APLP", "ARP", "BNPP", "CPN", "DA", "GPN", "JP",
 "LDPN", "MDJ", "MMN", "NAC", "NAP", "NCP", "ND", "NDP", "NMMN", "NNPP", "NPC", "NRP",
 "PAC", "PDP", "PMP", "PRP", "PSD", "PSP", "UDP", "UNPP")
for(i in c(party.names, "rejects")) nga.data[is.na(nga.data[,i]),i] <- 0
# There are some figures missing for totalvotes, but these are errors on photocopies
# Actual zero counts for totalvotes were recorded during data entry

# Attach data
attach(nga.data)

# Make sure there are no duplicates or missing for LGA, ward, and polling station codes
cbind(lgacode, wardcode, pstatcode)[which(duplicated(cbind(lgacode, wardcode, pstatcode))),]
nga.data[is.na(lgacode) | is.na(wardcode) | is.na(pstatcode),]

# Create a unique identifier for each ward
lgaward <- lgacode * 1000 + wardcode
# Number of polling stations in data
n.pstat <- length(lgaward)
# Number of wards in data
n.ward <- length(unique(lgaward))
# Number of wards per LGA
wards.in.lga <- cbind(lgacode, lgaward)
wards.in.lga <- wards.in.lga[!duplicated(wards.in.lga), ]
wards.in.lga <- table(wards.in.lga[, 1])
# Ward labels
nga.labels <- cbind(lgaward, gsub("/ ", "/", capwords(paste(as.character(lga), as.character(gsub("/", "/ ", ward)), sep=", "))))
nga.labels <- unique(nga.labels)
# Mark polling stations for which all vote returns end in zero
all.zero <- apply(nga.data[, 9:39] %/% 10 != nga.data[, 9:39] / 10, 1, sum)==0

# Ward level results for Plateau State from 2003 Presidential election in Nigeria
nga.ward.data <- read.csv("Nigeria_2003_Presidential_Plateau_wards.csv")
# For wards with multiple result sheets, keep row summarizing all sheets
nga.ward.data <- nga.ward.data[!duplicated(nga.ward.data[, c("lgacode", "wardcode", "major.town")], fromLast=T), ]

# Add prefix to variables
names(nga.ward.data) <- paste("w.", names(nga.ward.data), sep="")
# Attach data
attach(nga.ward.data)

# Make sure there are no duplicates or missing for LGA and ward codes
cbind(w.lgacode, w.wardcode)[which(duplicated(cbind(w.lgacode, w.wardcode))), ]
nga.ward.data[is.na(w.lgacode) | is.na(w.wardcode), ]

# Create a unique identifier for each ward
w.lgaward <- w.lgacode * 1000 + w.wardcode


# SENEGAL

# Results from 2000 (second round) and 2007 presidential elections in Senegal
# Vote returns are reported at polling station level, within d�partement
sen.2000.data <- read.csv("Senegal_2000_Presidential_last_digits_only.csv")
sen.2007.data <- read.csv("Senegal_2007_Presidential_last_digits_only.csv")

# Attach data
attach(sen.2000.data)
attach(sen.2007.data)


# CHICAGO

# Chicago results from 1924 and 1928 Presidential elections
# Vote returns are reported at precinct level, within wards
chi.1924.data <- read.csv("United_States_1924_Presidential_Chicago.csv", as.is=T)
chi.1928.data <- read.csv("United_States_1928_Presidential_Chicago.csv", as.is=T)

# Separate rows with totals
chi.1924.totals <- chi.1924.data[chi.1924.data[,"PRECINCT"]=="TOTAL", ]
chi.1928.totals <- chi.1928.data[chi.1928.data[,"PRECINCT"]=="TOTAL", ]
# Focus on Republican and Democratic candidates
chi.1924.data <- chi.1924.data[!chi.1924.data[,"PRECINCT"]=="TOTAL", c("WARD", "PRECINCT", "DAVIS", "COOLIDGE")]
chi.1928.data <- chi.1928.data[!chi.1928.data[,"PRECINCT"]=="TOTAL", c("WARD", "PRECINCT", "SMITH", "HOOVER")]

# Add suffix to variables
names(chi.1924.data)[1:2] <- paste(names(chi.1924.data)[1:2], ".1924", sep="")
names(chi.1928.data)[1:2] <- paste(names(chi.1928.data)[1:2], ".1928", sep="")

# Attach data
attach(chi.1924.data)
attach(chi.1928.data)

# Number of precincts in data
n.precincts.1924 <- length(PRECINCT.1924)
n.precincts.1928 <- length(PRECINCT.1928)


#############################################################################
###### Run digit analysis
#############################################################################

# SWEDEN

# Analyze last digits for S, M, registered voters, and total votes
swe.SAP.digits <- digit.test(S, muni)
swe.MSP.digits <- digit.test(M, muni)
swe.reg.digits <- digit.test(Antal.r�stber�ttigade, muni)
swe.tot.digits <- digit.test(Total, muni)
# Analyze last digits for all columns with sufficient number of voters
swe.mul.digits <- digit.test(c(S, M, Antal.r�stber�ttigade, Total), rep(muni, 4))


# NIGERIA

# Analyze last digits for PDP, ANPP, registered voters, and total votes
nga.PDP.digits <- digit.test(PDP, lgaward)
nga.ANPP.digits <- digit.test(ANPP, lgaward)
nga.reg.digits <- digit.test(regvoters, lgaward)
nga.tot.digits <- digit.test(totalvotes, lgaward)
# Analyze last digits for all columns with sufficient number of voters
nga.mul.digits <- digit.test(c(PDP, ANPP, regvoters, totalvotes), rep(lgaward, 4))
# Exclude small counts for ANPP
nga.ANPP.9.digits <- digit.test(ANPP[ANPP>9], lgaward[ANPP>9])
nga.ANPP.99.digits <- digit.test(ANPP[ANPP>99], lgaward[ANPP>99])
# Exclude polling stations where all vote returns end in zero
nga.PDP.nozero.digits <- digit.test(PDP[!all.zero], lgaward[!all.zero])
nga.ANPP.nozero.digits <- digit.test(ANPP[!all.zero], lgaward[!all.zero])
nga.reg.nozero.digits <- digit.test(regvoters[!all.zero], lgaward[!all.zero])
nga.tot.nozero.digits <- digit.test(totalvotes[!all.zero], lgaward[!all.zero])
nga.mul.nozero.digits <- digit.test(c(PDP[!all.zero], ANPP[!all.zero], regvoters[!all.zero], totalvotes[!all.zero]), rep(lgaward[!all.zero], 4))


# SENEGAL

# Analyze last digits for election winner Wade and other columns with relatively large expected values
sen.2000.Wade.digits <- digit.test(WADE, c(DEPARTEMENT))
sen.2000.mul.digits <- digit.test(c(DIOUF, INSCRITS), rep(c(DEPARTEMENT), 2))
sen.2007.Wade.digits <- digit.test(Wade, c(D�partement))
sen.2007.mul.digits <- digit.test(c(Seck, Dieng), rep(c(D�partement), 2))
# Exclude small counts
sen.2000.Wade.99.digits <- digit.test(c(WADE[WADE.large]), c(DEPARTEMENT[WADE.large]))
sen.2000.mul.99.digits <- digit.test(c(DIOUF[DIOUF.large], INSCRITS[INSCRITS.large]), c(DEPARTEMENT[DIOUF.large], DEPARTEMENT[INSCRITS.large]))
sen.2007.Wade.99.digits <- digit.test(c(Wade[Wade.large]), c(D�partement[Wade.large]))
sen.2007.mul.99.digits <- digit.test(c(Seck[Seck.large], Dieng[Dieng.large]), c(D�partement[Seck.large], D�partement[Dieng.large]))


# CHICAGO

# Analyze last digits for Davis and Coolidge (1924), and Smith and Hoover (1928)
chi.1924.dav.digits <- digit.test(DAVIS, WARD.1924)
chi.1924.coo.digits <- digit.test(COOLIDGE, WARD.1924)
chi.1924.mul.digits <- digit.test(c(DAVIS, COOLIDGE), rep(WARD.1924, 2))
chi.1928.smi.digits <- digit.test(SMITH, WARD.1928)
chi.1928.hoo.digits <- digit.test(HOOVER, WARD.1928)
chi.1928.mul.digits <- digit.test(c(SMITH, HOOVER), rep(WARD.1928, 2))


#############################################################################
###### Run simulations for confidence intervals
#############################################################################

# Function to get confidence intervals
sims.fct <- function(NSIMS=10000, VAL){
  sims <- c()
    for(j in 1:NSIMS) {
      draw <- sample(0:9, VAL, replace=T)
      prop <- sapply(0:9, function(x) length(draw[draw==x]) / VAL)
      sims <- rbind(sims, c(min(prop), max(prop), prop[1], sum(prop[2:3]), sum(prop[4:10])))
    }
  out <- c(VAL, quantile(sims[,1], probs=.025), quantile(sims[,2], probs=c(.975, .95)), quantile(sims[,3], probs=c(.025, .975, .05)), quantile(sims[,4], probs=c(.025, .975, .95)), quantile(sims[,5], probs=c(.025, .975, .05)))
  # Non-adjacency refers to pairs of non-repeating, non-adjacent numerals
  names(out) <- c("Number of observations", "Proportion, 2.5%", "Proportion, 97.5%", "Proportion, 95%", "Repetition, 2.5%", "Repetition, 97.5%", "Repetition, 5%", "Adjacency, 2.5%", "Adjacency, 97.5%", "Adjacency, 95%", "Non-adjacency, 2.5%", "Non-adjacency, 97.5%", "Non-adjacency, 5%")
  return(out)
}
sims.vec <- Vectorize(sims.fct, vectorize.args="VAL")

# Run simulations for all relevant numbers of observations
n.obs <- unique(c(swe.SAP.digits$n, swe.SAP.digits$all.n, swe.MSP.digits$n, swe.MSP.digits$all.n, swe.reg.digits$n, swe.reg.digits$all.n, swe.tot.digits$n, swe.tot.digits$all.n, swe.mul.digits$n, swe.mul.digits$all.n,
                  nga.PDP.digits$n, nga.PDP.digits$all.n, nga.ANPP.digits$n, nga.ANPP.digits$all.n, nga.reg.digits$n, nga.reg.digits$all.n, nga.tot.digits$n, nga.tot.digits$all.n, nga.mul.digits$n, nga.mul.digits$all.n,
                  nga.PDP.nozero.digits$n, nga.PDP.nozero.digits$all.n, nga.ANPP.nozero.digits$n, nga.ANPP.nozero.digits$all.n, nga.reg.nozero.digits$n, nga.reg.nozero.digits$all.n, nga.tot.nozero.digits$n, nga.tot.nozero.digits$all.n, nga.mul.nozero.digits$n, nga.mul.nozero.digits$all.n,
                  nga.ANPP.9.digits$n, nga.ANPP.9.digits$all.n, nga.ANPP.99.digits$n, nga.ANPP.99.digits$all.n,
                  sen.2000.Wade.digits$all.n, sen.2000.mul.digits$all.n, sen.2007.Wade.digits$all.n, sen.2007.mul.digits$all.n,
                  sen.2000.Wade.99.digits$all.n, sen.2000.mul.99.digits$all.n, sen.2007.Wade.99.digits$all.n, sen.2007.mul.99.digits$all.n,
                  chi.1924.dav.digits$n, chi.1924.dav.digits$all.n, chi.1924.coo.digits$n, chi.1924.coo.digits$all.n, chi.1924.mul.digits$n, chi.1924.mul.digits$all.n,
                  chi.1928.smi.digits$n, chi.1928.smi.digits$all.n, chi.1928.hoo.digits$n, chi.1928.hoo.digits$all.n, chi.1928.mul.digits$n, chi.1928.mul.digits$all.n))
n.obs <- n.obs[order(n.obs)]
set.seed(26022011)
ci <- sims.vec(NSIMS=10000, VAL=n.obs)


#############################################################################
###### Graphs for data from Sweden
#############################################################################

# Turnout by municipality
distrikt.turnout <- Total / Antal.r�stber�ttigade
muni.turnout <- sapply(unique(muni), function(x) c(x, sum(Total[muni==x]) / sum(Antal.r�stber�ttigade[muni==x])))

# Proportions for all municipalities
postscript("SWE_Prop.eps", width=8, height=8, bg="white", paper="special", horizontal=F, family="ComputerModern", encoding="TeXtext.enc")
  par(mfrow=c(2, 2))
  par(mar=c(2, 3, 2, 1))
  prop.plot(swe.SAP.digits$all.prop, ci.l=ci["Proportion, 2.5%", ci[1, ]==swe.SAP.digits$all.n], ci.u=ci["Proportion, 97.5%", ci[1, ]==swe.SAP.digits$all.n], MAIN="SAP results")
  prop.plot(swe.MSP.digits$all.prop, ci.l=ci["Proportion, 2.5%", ci[1, ]==swe.MSP.digits$all.n], ci.u=ci["Proportion, 97.5%", ci[1, ]==swe.MSP.digits$all.n], MAIN="MSP results")
  prop.plot(swe.reg.digits$all.prop, ci.l=ci["Proportion, 2.5%", ci[1, ]==swe.reg.digits$all.n], ci.u=ci["Proportion, 97.5%", ci[1, ]==swe.reg.digits$all.n], MAIN="Number of registered voters")
  prop.plot(swe.tot.digits$all.prop, ci.l=ci["Proportion, 2.5%", ci[1, ]==swe.tot.digits$all.n], ci.u=ci["Proportion, 97.5%", ci[1, ]==swe.tot.digits$all.n], MAIN="Total votes")
dev.off()

# Repetition, adjacency, and non-adjacency in last two digits across municipalities
# Look at subset of size equal to number of wards in Nigeria dataset, for easier comparison
set.seed(26022011)
swe.sel <- sample(unique(muni), n.ward)
swe.mul.sel.digits <- digit.test(c(S[muni %in% swe.sel], M[muni %in% swe.sel], Antal.r�stber�ttigade[muni %in% swe.sel], Total[muni %in% swe.sel]), rep(muni[muni %in% swe.sel], 4))

postscript("SWE_Distance.eps", width=7, height=10, bg="white", paper="special", horizontal=F, family="ComputerModern", encoding="TeXtext.enc")
  par(mfrow=c(3, 1))
  distance.plot(swe.mul.sel.digits, MAIN="", xrange=c(-.12, .41), yrange=c(log(min(swe.mul.sel.digits$n, nga.mul.digits$n)), log(max(swe.mul.sel.digits$n, nga.mul.digits$n))), WHICH=0, country="Sweden")
  distance.plot(swe.mul.sel.digits, MAIN="", xrange=c(-.41, .12), yrange=c(log(min(swe.mul.sel.digits$n, nga.mul.digits$n)), log(max(swe.mul.sel.digits$n, nga.mul.digits$n))), WHICH=1, print.lab=T, country="Sweden")
  distance.plot(swe.mul.sel.digits, MAIN="", xrange=c(-.12, .41), yrange=c(log(min(swe.mul.sel.digits$n, nga.mul.digits$n)), log(max(swe.mul.sel.digits$n, nga.mul.digits$n))), WHICH=2, country="Sweden")
dev.off()


#############################################################################
###### Graphs and additional analysis for data from Nigeria
#############################################################################

# Turnout by polling stations and ward
ps.turnout <- totalvotes / regvoters
ward.turnout <- sapply(unique(lgaward), function(x) c(x, sum(totalvotes[lgaward==x & !is.na(totalvotes) & !is.na(regvoters)]) / sum(regvoters[lgaward==x & !is.na(totalvotes) & !is.na(regvoters)])))

# Identify wards with at least one suspicious digit proportion across multiple vote columns
suspicious <- c()
for(i in 1:n.ward) suspicious <- c(suspicious, any(nga.mul.digits$prop[i, ] > ci["Proportion, 95%", ci[1, ]==nga.mul.digits$n[i]]))
suspicious.digit <- nga.mul.digits$id[suspicious]

# Identify wards with at least one suspicious digit proportion in total vote column
suspicious.total <- c()
for(i in 1:n.ward) suspicious.total <- c(suspicious.total, any(nga.tot.digits$prop[i, ] > ci["Proportion, 95%", ci[1, ]==nga.tot.digits$n[i]]))
suspicious.digit.total <- nga.tot.digits$id[suspicious.total]

# Can also identify suspicious wards as those for which chi-squared test of equifrequent last digits produces significant result
suspicious.digits <- nga.mul.digits$id[nga.mul.digits$pval < .05]
# Wards for which chi-squared test of distribution of last and second-to-last digit produces significant result
suspicious.distance <- nga.mul.digits$id[nga.mul.digits$distance.pval < .05]
# Wards for which either chi-squared test produces significant result
suspicious.cases <- nga.mul.digits$id[nga.mul.digits$pval < .05 | nga.mul.digits$distance.pval < .05]

# Result does depend on zeros, although remaining numerals also vary substantially more than in case of Sweden
1 - pchisq(sum((nga.mul.digits$all.count[2:10] - (1/9) * sum(nga.mul.digits$all.count[2:10]))^2 / ((1/9) * sum(nga.mul.digits$all.count[2:10]))), 8)
1 - pchisq(sum((swe.mul.digits$all.count[2:10] - (1/9) * sum(swe.mul.digits$all.count[2:10]))^2 / ((1/9) * sum(swe.mul.digits$all.count[2:10]))), 8)

# Last-digit proportions for all wards
postscript("NGA_Prop.eps", width=8, height=8, bg="white", paper="special", horizontal=F, family="ComputerModern", encoding="TeXtext.enc")
  par(mfrow=c(2, 2))
  par(mar=c(2, 3, 2, 1))
  prop.plot(nga.PDP.digits$all.prop, ci.l=ci["Proportion, 2.5%", ci[1, ]==nga.PDP.digits$all.n], ci.u=ci["Proportion, 97.5%", ci[1, ]==nga.PDP.digits$all.n], MAIN="PDP results")
  prop.plot(nga.ANPP.digits$all.prop, ci.l=ci["Proportion, 2.5%", ci[1, ]==nga.ANPP.digits$all.n], ci.u=ci["Proportion, 97.5%", ci[1, ]==nga.ANPP.digits$all.n], MAIN="ANPP results")
  prop.plot(nga.reg.digits$all.prop, ci.l=ci["Proportion, 2.5%", ci[1, ]==nga.reg.digits$all.n], ci.u=ci["Proportion, 97.5%", ci[1, ]==nga.reg.digits$all.n], MAIN="Number of registered voters")
  prop.plot(nga.tot.digits$all.prop, ci.l=ci["Proportion, 2.5%", ci[1, ]==nga.tot.digits$all.n], ci.u=ci["Proportion, 97.5%", ci[1, ]==nga.tot.digits$all.n], MAIN="Total votes")
dev.off()

# Check if results persist if we exclude small ANPP counts
prop.plot(nga.ANPP.9.digits$all.prop, ci.l=ci["Proportion, 2.5%", ci[1, ]==nga.ANPP.9.digits$all.n], ci.u=ci["Proportion, 97.5%", ci[1, ]==nga.ANPP.9.digits$all.n], MAIN="ANPP results")
prop.plot(nga.ANPP.99.digits$all.prop, ci.l=ci["Proportion, 2.5%", ci[1, ]==nga.ANPP.99.digits$all.n], ci.u=ci["Proportion, 97.5%", ci[1, ]==nga.ANPP.99.digits$all.n], MAIN="ANPP results")

# Calculate addition errors to see if number of 0s indicates laziness
# Error between party votes and total votes
recorded.votes <- apply(nga.data[, 9:39], 1, function(x) sum(x, na.rm=T)) # Votes for parties + rejects
error <- totalvotes - recorded.votes
# Wards with any addition errors
all.error.cases <- unique(lgaward[error!=0 & !is.na(error)])
# Wards with addition errors in proportion of polling stations given by pct
pct <- .2
error.cases <- c()
for(i in unique(lgaward)) if(length(lgaward[error!=0 & !is.na(error) & lgaward==i]) / table(lgaward)[paste(i)] > pct) error.cases <- c(error.cases, i)

# All party vote columns (plus rejected votes) end in zero for 19 out of 2531 polling stations
nga.data[all.zero, ]

# Plot proportions of last digits for randomly selected set of wards
# Draw sample of wards within strata to accurately reflect complete data
# Strata defined by error.cases and suspicious.digit, because plot highlights individual digit proportions that are suspicious
set.seed(26022011)
strata1 <- unique(lgaward)[(unique(lgaward) %in% suspicious.digit) & (unique(lgaward) %in% error.cases)]
strata2 <- unique(lgaward)[!(unique(lgaward) %in% suspicious.digit) & (unique(lgaward) %in% error.cases)]
strata3 <- unique(lgaward)[(unique(lgaward) %in% suspicious.digit) & !(unique(lgaward) %in% error.cases)]
strata4 <- unique(lgaward)[!(unique(lgaward) %in% suspicious.digit) & !(unique(lgaward) %in% error.cases)]
sel <- c(sample(strata1, round(67 * length(strata1) / n.ward)),
         sample(strata2, round(67 * length(strata2) / n.ward)),
         sample(strata3, round(67 * length(strata3) / n.ward)),
         sample(strata4, 67 - round(67 * length(strata1) / n.ward) - round(67 * length(strata2) / n.ward) - round(67 * length(strata3) / n.ward)))
nga.mul.sel.digits <- digit.test(c(PDP[lgaward %in% sel], ANPP[lgaward %in% sel], regvoters[lgaward %in% sel], totalvotes[lgaward %in% sel]), rep(lgaward[lgaward %in% sel], 4))

# Ward-level proportion plots for sample of wards, highlighting wards where numbers don't add up
postscript("NGA_Prop_Many.eps", width=10, height=7, bg="white", paper="special", horizontal=F, family="ComputerModern", encoding="TeXtext.enc")
  prop.plot.many(nga.mul.sel.digits, wrap("Proportions of last digits for multiple return sheet columns, by ward, sorted by number of polling stations, with unadjusted 95% confidence bound", 40), highlight=error.cases, highlight.col="black")
dev.off()

# Ward-level plot of deviations in proportions, for all wards
# Change some labels for plotting
nga.labels.prop.all <- nga.labels
nga.labels.prop.all[nga.labels.prop.all[, 2]=="Bokkos, Kwatas", 2] <- "Bokkos, Kwat."
nga.labels.prop.all[nga.labels.prop.all[, 2]=="Jos-North, Jos Jarawa", 2] <- "Jos-North, Ja."
postscript("NGA_Prop_All.eps", width=10, height=7, bg="white", paper="special", horizontal=F, family="ComputerModern", encoding="TeXtext.enc")
  prop.plot.all(nga.mul.digits, nga.labels.prop.all, lab.thold=0, lab.dist=.13, spread.tol=.006)
dev.off()

# Look at distribution of p-values from chi-square test for equifrequent last digits
postscript("NGA_Pvals.eps", width=10.5, height=7, bg="white", paper="special", horizontal=F, family="ComputerModern", encoding="TeXtext.enc")
  par(mfrow=c(1, 2))
  pval.plot(swe.mul.digits$pval, main="Sweden")
  pval.plot(nga.mul.digits$pval, main="Nigeria")
dev.off()

# Repetition, adjacency, and non-adjacency in last two digits across municipalities
postscript("NGA_Distance.eps", width=7, height=10, bg="white", paper="special", horizontal=F, family="ComputerModern", encoding="TeXtext.enc")
  par(mfrow=c(3, 1))
  distance.plot(nga.mul.digits, MAIN="", xrange=c(-.12, .41), yrange=c(log(min(swe.mul.sel.digits$n, nga.mul.digits$n)), log(max(swe.mul.sel.digits$n, nga.mul.digits$n))), WHICH=0)
  distance.plot(nga.mul.digits, MAIN="", xrange=c(-.41, .12), yrange=c(log(min(swe.mul.sel.digits$n, nga.mul.digits$n)), log(max(swe.mul.sel.digits$n, nga.mul.digits$n))), WHICH=1, print.lab=T)
  distance.plot(nga.mul.digits, MAIN="", xrange=c(-.12, .41), yrange=c(log(min(swe.mul.sel.digits$n, nga.mul.digits$n)), log(max(swe.mul.sel.digits$n, nga.mul.digits$n))), WHICH=2)
dev.off()
# Number of wards that have suspiciously few non-adjacent and non-repeating digit pairs
sum(sapply(1:n.ward, function(x) sum(nga.mul.digits$distance.prop[x, 3:6]) < ci["Non-adjacency, 5%", ci[1, ]==nga.mul.digits$n[x]]))

# Merge ward-level data for regression analysis
ward.mat <- merge(cbind(t(ward.turnout), table(lgaward), ward.turnout[1, ] %in% error.cases, ward.turnout[1, ] %in% suspicious.cases, ward.turnout[1, ] %in% suspicious.digits, ward.turnout[1, ] %in% suspicious.digit, ward.turnout[1, ] %in% suspicious.digit.total, ward.turnout[1, ] %in% suspicious.distance), cbind(w.lgaward, w.major.town), by.x=1, by.y=1)
names(ward.mat)[1:9] <- c("id", "turnout", "ward.size", "error.cases", "suspicious.cases", "suspicious.digits", "suspicious.digit", "suspicious.digit.total", "suspicious.distance")

# Are suspicious wards in urban or rural areas?
# Coding rule for urban-rural indicator: coded 1 if ward lies in LGA synonymous with "major town" as identified by Nigeria Congress Online 2003

# Urban areas less likely to appear suspicious
summary(glm(ward.mat[, "suspicious.cases"] ~ ward.mat[, "w.major.town"], family=binomial))

# Ward size does not generally correlate with irregularities in a significant way
summary(glm(ward.mat[, "suspicious.cases"] ~ ward.mat[, "ward.size"], family=binomial))
summary(glm(ward.mat[, "suspicious.digits"] ~ ward.mat[, "ward.size"], family=binomial))
# Positive correlation with suspicious deviations on single digits, because power of single-digit test is correlated with ward size
summary(glm(ward.mat[, "suspicious.digit"] ~ ward.mat[, "ward.size"], family=binomial))

# No significant role for turnout
summary(glm(ward.mat[, "suspicious.cases"] ~ ward.mat[, "turnout"], family=binomial))
summary(glm(ward.mat[, "suspicious.digit"] ~ ward.mat[, "turnout"], family=binomial))
summary(glm(ward.mat[, "suspicious.digits"] ~ ward.mat[, "turnout"], family=binomial))

# Addition error does not correlate with suspicious digits, except in total vote counts
summary(glm(ward.mat[, "suspicious.cases"] ~ ward.mat[, "error.cases"], family=binomial))
summary(glm(ward.mat[, "suspicious.digit"] ~ ward.mat[, "error.cases"], family=binomial))
summary(glm(ward.mat[, "suspicious.digit.total"] ~ ward.mat[, "error.cases"], family=binomial))


#############################################################################
###### Graphs and additional analysis for data from Senegal
#############################################################################

# Proportions for all polling stations, excluding single- and double-digit counts
postscript("SEN_2000_Prop.eps", width=8, height=4, bg="white", paper="special", horizontal=F, family="ComputerModern", encoding="TeXtext.enc")
  par(mfrow=c(1, 2))
  par(mar=c(2, 3, 2, 1))
  prop.plot(sen.2000.Wade.99.digits$all.prop, ci.l=ci["Proportion, 2.5%", ci[1, ]==sen.2000.Wade.99.digits$all.n], ci.u=ci["Proportion, 97.5%", ci[1, ]==sen.2000.Wade.99.digits$all.n], MAIN="Wade")
  prop.plot(sen.2000.mul.99.digits$all.prop, ci.l=ci["Proportion, 2.5%", ci[1, ]==sen.2000.mul.99.digits$all.n], ci.u=ci["Proportion, 97.5%", ci[1, ]==sen.2000.mul.99.digits$all.n], MAIN="Other columns")
dev.off()
postscript("SEN_2007_Prop.eps", width=8, height=4, bg="white", paper="special", horizontal=F, family="ComputerModern", encoding="TeXtext.enc")
  par(mfrow=c(1, 2))
  par(mar=c(2, 3, 2, 1))
  prop.plot(sen.2007.Wade.99.digits$all.prop, ci.l=ci["Proportion, 2.5%", ci[1, ]==sen.2007.Wade.99.digits$all.n], ci.u=ci["Proportion, 97.5%", ci[1, ]==sen.2007.Wade.99.digits$all.n], MAIN="Wade")
  prop.plot(sen.2007.mul.99.digits$all.prop, ci.l=ci["Proportion, 2.5%", ci[1, ]==sen.2007.mul.99.digits$all.n], ci.u=ci["Proportion, 97.5%", ci[1, ]==sen.2007.mul.99.digits$all.n], MAIN="Other columns")
dev.off()

# Share of single- and double-digit vote counts, in comparison to Nigeria and Sweden
# Senegal, 2000
(sum(!is.na(c(WADE, DIOUF, INSCRITS))) - sum(c(WADE.large, DIOUF.large, INSCRITS.large), na.rm=T)) / sum(!is.na(c(WADE, DIOUF, INSCRITS)))
# Senegal, 2007
(sum(!is.na(c(Wade, Seck, Dieng))) - sum(c(Wade.large, Seck.large, Dieng.large), na.rm=T)) / sum(!is.na(c(Wade, Seck, Dieng)))
# Nigeria, 2003
(sum(!is.na(c(PDP, ANPP, regvoters, totalvotes))) - sum(c(PDP, ANPP, regvoters, totalvotes)>99, na.rm=T)) / sum(!is.na(c(PDP, ANPP, regvoters, totalvotes)))
# Sweden, 2002
(sum(!is.na(c(S, M, Antal.r�stber�ttigade, Total))) - sum(c(S, M, Antal.r�stber�ttigade, Total)>99, na.rm=T)) / sum(!is.na(c(S, M, Antal.r�stber�ttigade, Total)))


#############################################################################
###### Additional analysis for data from Chicago
#############################################################################

# Margins of victory
sum(chi.1924.totals[, "COOLIDGE"]) / sum(as.numeric(unlist(chi.1924.totals[, 5:11])))
sum(chi.1924.totals[, "DAVIS"]) / sum(as.numeric(unlist(chi.1924.totals[, 5:11])))
sum(chi.1928.totals[, "HOOVER"]) / sum(as.numeric(unlist(chi.1928.totals[, 5:9])))
sum(chi.1928.totals[, "SMITH"]) / sum(as.numeric(unlist(chi.1928.totals[, 5:9])))

# Save workspace
save.image("Beber_Scacco_Election_Fraud_replication.RData")
